Sugar and Protein Influence on Recipe Ratings¶
Name(s): Nuojinli Xu, Jiaxin He
Website Link: https://hjxhh030315.github.io/dsc80/
import pandas as pd
import numpy as np
from pathlib import Path
import time
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
pd.options.plotting.backend = 'plotly'
# from dsc80_utils import * # Feel free to uncomment and use this.
from tqdm import tqdm
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import QuantileTransformer, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, QuantileTransformer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import mean_squared_error
import plotly.io as pio
pio.renderers.default = 'notebook'
%config Application.log_level = 'CRITICAL'
Step 1: Introduction¶
# Load the user–recipe interaction data (one row per rating/review).
interactions_file = "RAW_interactions.csv"
df1 = pd.read_csv(interactions_file)
df1
| user_id | recipe_id | date | rating | review | |
|---|---|---|---|---|---|
| 0 | 1293707 | 40893 | 2011-12-21 | 5 | So simple, so delicious! Great for chilly fall... |
| 1 | 126440 | 85009 | 2010-02-27 | 5 | I made the Mexican topping and took it to bunk... |
| 2 | 57222 | 85009 | 2011-10-01 | 5 | Made the cheddar bacon topping, adding a sprin... |
| 3 | 124416 | 120345 | 2011-08-06 | 0 | Just an observation, so I will not rate. I fo... |
| 4 | 2000192946 | 120345 | 2015-05-10 | 2 | This recipe was OVERLY too sweet. I would sta... |
| ... | ... | ... | ... | ... | ... |
| 731922 | 2002357020 | 82303 | 2018-12-05 | 5 | Delicious quick thick chocolate sauce with ing... |
| 731923 | 583662 | 386618 | 2009-09-29 | 5 | These were so delicious! My husband and I tru... |
| 731924 | 157126 | 78003 | 2008-06-23 | 5 | WOW! Sometimes I don't take the time to rate ... |
| 731925 | 53932 | 78003 | 2009-01-11 | 4 | Very good! I used regular port as well. The ... |
| 731926 | 2001868099 | 78003 | 2017-12-18 | 5 | I am so glad I googled and found this here. Th... |
731927 rows × 5 columns
# Load the recipe metadata (one row per recipe).
recipes_file = "RAW_recipes.csv"
df2 = pd.read_csv(recipes_file)
df2
| name | id | minutes | contributor_id | submitted | tags | nutrition | n_steps | steps | description | ingredients | n_ingredients | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 brownies in the world best ever | 333281 | 40 | 985201 | 2008-10-27 | ['60-minutes-or-less', 'time-to-make', 'course... | [138.4, 10.0, 50.0, 3.0, 3.0, 19.0, 6.0] | 10 | ['heat the oven to 350f and arrange the rack i... | these are the most; chocolatey, moist, rich, d... | ['bittersweet chocolate', 'unsalted butter', '... | 9 |
| 1 | 1 in canada chocolate chip cookies | 453467 | 45 | 1848091 | 2011-04-11 | ['60-minutes-or-less', 'time-to-make', 'cuisin... | [595.1, 46.0, 211.0, 22.0, 13.0, 51.0, 26.0] | 12 | ['pre-heat oven the 350 degrees f', 'in a mixi... | this is the recipe that we use at my school ca... | ['white sugar', 'brown sugar', 'salt', 'margar... | 11 |
| 2 | 412 broccoli casserole | 306168 | 40 | 50969 | 2008-05-30 | ['60-minutes-or-less', 'time-to-make', 'course... | [194.8, 20.0, 6.0, 32.0, 22.0, 36.0, 3.0] | 6 | ['preheat oven to 350 degrees', 'spray a 2 qua... | since there are already 411 recipes for brocco... | ['frozen broccoli cuts', 'cream of chicken sou... | 9 |
| 3 | millionaire pound cake | 286009 | 120 | 461724 | 2008-02-12 | ['time-to-make', 'course', 'cuisine', 'prepara... | [878.3, 63.0, 326.0, 13.0, 20.0, 123.0, 39.0] | 7 | ['freheat the oven to 300 degrees', 'grease a ... | why a millionaire pound cake? because it's su... | ['butter', 'sugar', 'eggs', 'all-purpose flour... | 7 |
| 4 | 2000 meatloaf | 475785 | 90 | 2202916 | 2012-03-06 | ['time-to-make', 'course', 'main-ingredient', ... | [267.0, 30.0, 12.0, 12.0, 29.0, 48.0, 2.0] | 17 | ['pan fry bacon , and set aside on a paper tow... | ready, set, cook! special edition contest entr... | ['meatloaf mixture', 'unsmoked bacon', 'goat c... | 13 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 83777 | zydeco soup | 486161 | 60 | 227978 | 2012-08-29 | ['ham', '60-minutes-or-less', 'time-to-make', ... | [415.2, 26.0, 34.0, 26.0, 44.0, 21.0, 15.0] | 7 | ['heat oil in a 4-quart dutch oven', 'add cele... | this is a delicious soup that i originally fou... | ['celery', 'onion', 'green sweet pepper', 'gar... | 22 |
| 83778 | zydeco spice mix | 493372 | 5 | 1500678 | 2013-01-09 | ['15-minutes-or-less', 'time-to-make', 'course... | [14.8, 0.0, 2.0, 58.0, 1.0, 0.0, 1.0] | 1 | ['mix all ingredients together thoroughly'] | this spice mix will make your taste buds dance! | ['paprika', 'salt', 'garlic powder', 'onion po... | 13 |
| 83779 | zydeco ya ya deviled eggs | 308080 | 40 | 37779 | 2008-06-07 | ['60-minutes-or-less', 'time-to-make', 'course... | [59.2, 6.0, 2.0, 3.0, 6.0, 5.0, 0.0] | 7 | ['in a bowl , combine the mashed yolks and may... | deviled eggs, cajun-style | ['hard-cooked eggs', 'mayonnaise', 'dijon must... | 8 |
| 83780 | cookies by design cookies on a stick | 298512 | 29 | 506822 | 2008-04-15 | ['30-minutes-or-less', 'time-to-make', 'course... | [188.0, 11.0, 57.0, 11.0, 7.0, 21.0, 9.0] | 9 | ['place melted butter in a large mixing bowl a... | i've heard of the 'cookies by design' company,... | ['butter', 'eagle brand condensed milk', 'ligh... | 10 |
| 83781 | cookies by design sugar shortbread cookies | 298509 | 20 | 506822 | 2008-04-15 | ['30-minutes-or-less', 'time-to-make', 'course... | [174.9, 14.0, 33.0, 4.0, 4.0, 11.0, 6.0] | 5 | ['whip sugar and shortening in a large bowl , ... | i've heard of the 'cookies by design' company,... | ['granulated sugar', 'shortening', 'eggs', 'fl... | 7 |
83782 rows × 12 columns
# Attach recipe metadata to every interaction; the inner join keeps only
# interactions whose recipe appears in the recipes table.
df = df1.merge(df2, left_on="recipe_id", right_on="id", how="inner")
# Discard extreme preparation times (>= 10 hours) before exploring.
df = df[df["minutes"] < 600]
df.isna().sum()
user_id 0 recipe_id 0 date 0 rating 0 review 55 name 1 id 0 minutes 0 contributor_id 0 submitted 0 tags 0 nutrition 0 n_steps 0 steps 0 description 113 ingredients 0 n_ingredients 0 dtype: int64
# Univariate distributions of the three quantities used later.
# Ratings:
px.histogram(df, x='rating')
# Preparation time in minutes:
px.histogram(df, x='minutes')
# Ingredient counts:
px.histogram(df, x='n_ingredients')
Step 2: Data Cleaning and Exploratory Data Analysis¶
import ast  # stdlib parser for the stringified nutrition lists

# Keep only the modelling columns, on an explicit copy so the column
# assignments below operate on an independent frame instead of a view
# of `df` (avoids SettingWithCopyWarning / silent no-op writes).
df_clean = df[["user_id", "recipe_id", "rating", "nutrition", "n_ingredients", "minutes"]].copy()
# `nutrition` is stored as the *string* "[cal, fat, sugar, ...]".
# ast.literal_eval parses it safely; the original `eval` would execute
# arbitrary code embedded in the CSV.
df_clean["nutrition"] = df_clean["nutrition"].apply(ast.literal_eval)
df_clean.head()
| user_id | recipe_id | rating | nutrition | n_ingredients | minutes | |
|---|---|---|---|---|---|---|
| 0 | 483827 | 306785 | 5 | [95.3, 1.0, 50.0, 16.0, 5.0, 0.0, 7.0] | 8 | 40 |
| 1 | 5060 | 310237 | 5 | [143.5, 5.0, 25.0, 3.0, 10.0, 3.0, 7.0] | 10 | 30 |
| 2 | 935485 | 321038 | 5 | [182.4, 2.0, 50.0, 7.0, 11.0, 1.0, 13.0] | 14 | 22 |
| 3 | 539686 | 321038 | 5 | [182.4, 2.0, 50.0, 7.0, 11.0, 1.0, 13.0] | 14 | 22 |
| 4 | 22174 | 342209 | 4 | [658.2, 45.0, 151.0, 35.0, 24.0, 72.0, 29.0] | 12 | 40 |
# Positions 2 and 4 of the parsed nutrition list are used as the sugar
# and protein values here (values appear to be percent daily values —
# verify ordering against the dataset's data dictionary).
for col, idx in [('sugar', 2), ('protein', 4)]:
    df_clean.loc[:, col] = df_clean["nutrition"].str[idx]
df_clean.head()
| user_id | recipe_id | rating | nutrition | n_ingredients | minutes | sugar | protein | |
|---|---|---|---|---|---|---|---|---|
| 0 | 483827 | 306785 | 5 | [95.3, 1.0, 50.0, 16.0, 5.0, 0.0, 7.0] | 8 | 40 | 50.0 | 5.0 |
| 1 | 5060 | 310237 | 5 | [143.5, 5.0, 25.0, 3.0, 10.0, 3.0, 7.0] | 10 | 30 | 25.0 | 10.0 |
| 2 | 935485 | 321038 | 5 | [182.4, 2.0, 50.0, 7.0, 11.0, 1.0, 13.0] | 14 | 22 | 50.0 | 11.0 |
| 3 | 539686 | 321038 | 5 | [182.4, 2.0, 50.0, 7.0, 11.0, 1.0, 13.0] | 14 | 22 | 50.0 | 11.0 |
| 4 | 22174 | 342209 | 4 | [658.2, 45.0, 151.0, 35.0, 24.0, 72.0, 29.0] | 12 | 40 | 151.0 | 24.0 |
# Collapse to one row per recipe: average the user ratings, keep the
# recipe-level fields (identical within each recipe) via 'first'.
per_recipe = {
    "rating": "mean",
    "nutrition": "first",
    "sugar": "first",
    "protein": "first",
    "n_ingredients": "first",
    "minutes": "first",
}
df_clean = df_clean.groupby("recipe_id").agg(per_recipe).reset_index()
df_clean.head()
| recipe_id | rating | nutrition | sugar | protein | n_ingredients | minutes | |
|---|---|---|---|---|---|---|---|
| 0 | 275022 | 3.0 | [386.1, 34.0, 7.0, 24.0, 41.0, 62.0, 8.0] | 7.0 | 41.0 | 7 | 50 |
| 1 | 275024 | 3.0 | [377.1, 18.0, 208.0, 13.0, 13.0, 30.0, 20.0] | 208.0 | 13.0 | 8 | 55 |
| 2 | 275026 | 3.0 | [326.6, 30.0, 12.0, 27.0, 37.0, 51.0, 5.0] | 12.0 | 37.0 | 9 | 45 |
| 3 | 275030 | 5.0 | [577.7, 53.0, 149.0, 19.0, 14.0, 67.0, 21.0] | 149.0 | 14.0 | 9 | 45 |
| 4 | 275032 | 5.0 | [386.9, 0.0, 347.0, 0.0, 1.0, 0.0, 33.0] | 347.0 | 1.0 | 9 | 25 |
Univariate Analysis¶
As part of our preliminary data exploration, we visualized the distribution of sugar content across all recipes using a box plot, shown below. Values above 300 grams were determined to be outliers, significantly deviating from the typical range of sugar content observed in the majority of recipes. To ensure the robustness of our predictive modeling and subsequent analyses, we decided to exclude these outlier values from our dataset. This step helps in focusing our analysis on more typical instances, ensuring more generalizable and relevant insights.
# Sugar distribution before any outlier trimming.
fig = px.box(df_clean, x='sugar',
             title='Box Plot of Sugar Content in Recipes')
fig.update_layout(xaxis_title='Sugar (grams)', yaxis_title='')
fig.show()
Interpretation of the Adjusted Box Plot Main Box (IQR): The box represents the interquartile range (IQR), which contains the middle 50% of the data. This indicates where the bulk of your recipes' sugar content lies. Whiskers: Extend to the smallest and largest values that are not considered outliers. In this plot, it looks like the upper whisker ends just below 200 grams, suggesting that any values beyond this point are relatively rare. Outliers: The absence of points beyond the whiskers in this plot simplifies the interpretation, emphasizing that very few, if any, recipes exceed the 300-gram threshold.
# Drop sugar outliers (above 300) and re-draw the box plot.
df_clean = df_clean[df_clean['sugar'] <= 300]
fig = px.box(df_clean, x='sugar',
             title='Box Plot of Sugar Content in Recipes')
fig.update_layout(xaxis_title='Sugar (grams)', yaxis_title='')
fig.show()
Our analysis of protein content in recipes also led to the establishment of a cutoff at 200 grams. The decision mirrors our approach with sugar content, aimed at excluding atypical, high-protein recipes from our analysis. This threshold was based on a detailed review of the protein content distribution, where values exceeding 200 grams were identified as outliers. Such high-protein recipes are uncommon in regular diets and could skew our understanding of factors influencing recipe ratings.
# Protein distribution before trimming.
fig = px.box(df_clean, x='protein',
             title='Box Plot of Protein Content in Recipes')
fig.update_layout(xaxis_title='Protein (grams)', yaxis_title='')
fig.show()
# Drop protein outliers (above 200), then inspect ingredient counts.
df_clean = df_clean[df_clean['protein'] <= 200]
fig = px.box(df_clean, x='n_ingredients',
             title='Box Plot of Number of Ingredients in Recipes')
fig.update_layout(xaxis_title='Number', yaxis_title='',
                  xaxis=dict(range=[0, 50]))
fig.show()
Bivariate Analysis¶
# Mean rating (and other numeric columns) at each distinct sugar value.
df_graph = df_clean.groupby('sugar').mean(numeric_only=True).reset_index()
fig = px.scatter(
    df_graph, x='sugar', y='rating',
    title='Relation between sugar and rating',
    labels={'sugar': 'sugar (grams)', 'rating': 'Avg rating'},
)
fig.update_layout(title={'x': 0.5, 'xanchor': 'center'})
Interpreting the Scatter Plot¶
Distribution Trends: The scatter plot reveals that as sugar content varies, the ratings mostly cluster between 4.2 and 4.6. There doesn't appear to be a clear trend or pattern indicating a strong linear relationship between sugar content and average ratings, as the data points are quite spread out horizontally.
Potential Correlation: The lack of a visible upward or downward trend suggests that there may not be a strong direct correlation between sugar content and recipe ratings. However, this does not rule out other forms of relationship or underlying factors that could influence these variables.
Outlier Consideration: Even with the sugar cutoff at 300 grams, the plot shows some spread in ratings at various sugar levels, but there are no extreme outliers in sugar content that distort the analysis. This supports our decision to limit sugar content to 300 grams in our analysis.
# Mean rating at each distinct protein value.
# Bug fix: the original grouped by 'protein' but plotted x="sugar" with a
# sugar title, so it re-drew the sugar chart on the wrong aggregation;
# the surrounding write-up discusses protein.
df_graph = df_clean.groupby('protein').mean(numeric_only=True).reset_index()
fig = px.scatter(df_graph, x="protein", y="rating",
                 title="Relation between Protein and Rating",
                 labels={"protein": "Protein (grams)", "rating": "Average Rating"})
fig.update_layout(title={'x': 0.5, 'xanchor': 'center'})
fig.show()
Observations from the Scatter Plot
Cluster of Data: The bulk of data points are clustered around lower protein levels, particularly below 40 grams of protein. Most of these points have ratings between approximately 4.2 and 4.6.
Sparse Data at Higher Protein Levels: There are fewer data points as protein content increases. It's noticeable that recipes with very high protein content (above 80 grams) are sparse and don't clearly indicate a consistent trend in ratings.
No Strong Correlation: Similar to the sugar analysis, there does not appear to be a strong or clear linear relationship between protein content and ratings. The ratings fluctuate across the range of protein content without a discernible pattern indicating that higher or lower protein directly influences higher ratings.
Outliers: There are a few data points at very high protein levels with a wide range of ratings. These points do not provide enough information to draw substantial conclusions due to their scarcity.
Interesting Aggregates¶
def rating_category(rating):
    """Bucket a mean rating: 'Low' (< 3.5), 'Medium' (3.5–4.5 inclusive),
    or 'High' (> 4.5)."""
    if rating > 4.5:
        return 'High'
    if rating < 3.5:
        return 'Low'
    return 'Medium'
# Tag each recipe with its rating tier, then compare average sugar and
# protein across the tiers.
df_clean['rating_category'] = df_clean['rating'].map(rating_category)
pivot_table = df_clean.pivot_table(
    values=['sugar', 'protein'],
    index='rating_category',
    aggfunc='mean',
)
pivot_table
| protein | sugar | |
|---|---|---|
| rating_category | ||
| High | 29.775338 | 43.262602 |
| Low | 30.634035 | 45.691471 |
| Medium | 32.456278 | 40.633536 |
# Median sugar content of the cleaned data (computed before the filter
# below, so rows at exactly 300 still count toward the median).
median_sugar = df_clean['sugar'].median()
# NOTE(review): sugar was already capped at <= 300 earlier; this strict
# < 300 filter only removes rows with sugar exactly 300 — likely redundant.
df_clean = df_clean[df_clean['sugar'] < 300]
median_sugar
np.float64(22.0)
Step 3: Assessment of Missingness¶
Assessment of Missing Data¶
Upon reviewing the dataset, particularly the ratings column, it appears that recipes with higher sugar content have a higher incidence of missing ratings. This observation leads to the hypothesis that users may be less inclined to rate recipes perceived as unhealthy (high sugar), possibly skewing our analysis if not addressed. Further investigation and additional data collection might be required to fully understand this pattern and adjust for potential biases.
df.isna().sum(axis=0)
user_id 0 recipe_id 0 date 0 rating 0 review 55 name 1 id 0 minutes 0 contributor_id 0 submitted 0 tags 0 nutrition 0 n_steps 0 steps 0 description 113 ingredients 0 n_ingredients 0 dtype: int64
Review (55 missing entries):
Potential NMAR Situation: If reviews are more likely to be missing for certain types of recipes (e.g., less popular or more complex recipes), this missingness might be NMAR. For instance, users might choose not to leave a review for recipes they didn’t enjoy or found difficult to make.
# Observed statistic: |mean n_steps where review is missing
#                      - mean n_steps where review is present|
missing_mask = df['review'].isna()
first = df.loc[missing_mask, 'n_steps'].mean()
second = df.loc[~missing_mask, 'n_steps'].mean()
obs = abs(first - second)
obs
np.float64(3.8720802340512925)
# Permutation test: shuffle n_steps to simulate the statistic's
# distribution under the null (review missingness unrelated to n_steps).
# Accumulate in a Python list — the original np.append re-allocated and
# copied the whole array on every iteration (quadratic time); one
# np.array(...) at the end is linear.  (Also fixes the 'shffuled' typo.)
sim_diffs = []
for _ in tqdm(range(1000)):
    missingness_df = df.assign(shuffled=np.random.permutation(df['n_steps']))
    a = missingness_df[missingness_df['review'].isna()]['shuffled'].mean()
    b = missingness_df[missingness_df['review'].notna()]['shuffled'].mean()
    sim_diffs.append(abs(a - b))
result = np.array(sim_diffs)
100%|██████████| 1000/1000 [02:12<00:00, 7.52it/s]
(result > obs).mean()
np.float64(0.0)
# Null distribution of the statistic with the observed value marked.
fig = px.histogram(
    result, nbins=30, histnorm='probability',
    title='Distribution Absolute Difference in Mean under Permutation test',
)
fig.add_vline(x=obs, line_color="red",
              annotation_text="Observed Stats",
              annotation_position="top right")
fig.update_layout(showlegend=False,
                  xaxis_title="Absolute Difference in Means",
                  yaxis_title="Frequency")
fig.write_html('./imgs/missingness.html', include_plotlyjs='cdn')
fig.show()
Step 4: Hypothesis Testing¶
Null Hypothesis (H₀)¶
The null hypothesis states that there is no difference in the variance of recipe ratings between high-sugar recipes (more than 150 grams of sugar) and low-sugar recipes (150 grams of sugar or less). This implies that the sugar content does not affect the variability in how recipes are rated.
H0: Var(ratings|sugar >150) = Var(ratings|sugar<=150)
Alternative Hypothesis (H₁)¶
The alternative hypothesis contends that there is a difference in the variance of ratings between high-sugar and low-sugar recipes. This suggests that the amount of sugar in a recipe does influence the variability in ratings, possibly due to varying preferences or perceptions among users about sugar content.
H1: Var(ratings|sugar >150) not = Var(ratings|sugar<=150)
Conducting multiple permutations to simulate the distribution of differences in variance under the null hypothesis (that sugar content does not affect the variance of ratings).
# First-pass permutation test (100 shuffles) for the variance-difference
# statistic; superseded by the 1000-shuffle run further down, whose
# `diffs` array feeds the reported p-value.
# NOTE(review): seeding from wall-clock time makes this run
# non-reproducible — use a fixed seed if results must be replicated.
np.random.seed(int(time.time()))
result = np.array([])
for i in tqdm(range (100)):
    # Break the sugar–rating association by shuffling the sugar column.
    shuffled = df_clean.assign(shuffle = np.random.permutation(df_clean['sugar']))
    a = shuffled[shuffled['shuffle'] > 150]['rating'].var()
    b = shuffled[shuffled['shuffle'] <= 150]['rating'].var()
    diff = abs(a-b)
    # NOTE(review): np.append copies the whole array each iteration;
    # a list plus one np.array(...) at the end would be linear instead.
    result = np.append(result,diff)
100%|██████████| 100/100 [00:01<00:00, 59.58it/s]
Test statistic: Absolute difference in Variances
- Test Statistic = |Var_high sugar - Var_low sugar|
# Observed statistic: absolute difference in rating variance between
# high-sugar (> 150) and low-sugar (<= 150) recipes.
observed_diff = abs(
    df_clean[df_clean['sugar'] > 150]['rating'].var() -
    df_clean[df_clean['sugar'] <= 150]['rating'].var()
)

# Permutation test: shuffle the sugar labels and recompute the statistic.
# Accumulate in a list — np.append inside the loop copies the whole
# array every iteration (quadratic); one np.array(...) at the end is linear.
sim = []
for _ in tqdm(range(1000)):
    shuffled_sugar = np.random.permutation(df_clean['sugar'])
    high_sugar_var = df_clean[shuffled_sugar > 150]['rating'].var()
    low_sugar_var = df_clean[shuffled_sugar <= 150]['rating'].var()
    sim.append(abs(high_sugar_var - low_sugar_var))
diffs = np.array(sim)

# p-value: proportion of simulated diffs at least as extreme as observed.
p_value = (diffs >= observed_diff).mean()
p_value
100%|██████████| 1000/1000 [00:06<00:00, 160.11it/s]
np.float64(0.0)
# Null distribution of the variance difference, observed value in red.
fig = px.histogram(
    diffs, nbins=30, histnorm='probability', range_x=[0, 0.3],
    title='Permutation Test for Variance Differences in Recipe Ratings',
)
fig.add_vline(observed_diff, line_color='red',
              annotation_text='Observed Stats ',
              annotation_position='top left')
fig.update_layout(showlegend=False,
                  xaxis_title='Difference in Variance',
                  yaxis_title='Frequency')
fig.show()
fig.write_html('./imgs/hypo_test.html', include_plotlyjs='cdn')
Step 5: Framing a Prediction Problem¶
Predict the rating of a recipe based on its nutritional content (sugar, protein) and complexity (number of ingredients)
Choice Justification: RMSE will provide a clear measure of how off the predictions are, in the same units as the ratings themselves. This metric is particularly effective because it penalizes larger errors more severely, aligning well with the importance of accurate rating predictions.
Step 6: Baseline Model¶
Multiple Linear Regression with feature engineering
Standardize sugar and protein; quantile-transform n_ingredients
# Baseline: linear regression on standardized sugar and protein.
X = df_clean[['sugar', 'protein', 'n_ingredients']]
y = df_clean['rating']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# NOTE(review): only sugar and protein reach the regression — with
# remainder="drop", n_ingredients is discarded even though the write-up
# mentions quantile-transforming it.  Confirm this is intentional.
preproc = ColumnTransformer(
    transformers=[('std', StandardScaler(), ['sugar', 'protein'])],
    remainder="drop",
)
pl = Pipeline([('preproc', preproc), ('lr', LinearRegression())])
pl.fit(X_train, y_train)
Pipeline(steps=[('preproc',
ColumnTransformer(transformers=[('std', StandardScaler(),
['sugar', 'protein'])])),
('lr', LinearRegression())])In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
Pipeline(steps=[('preproc',
ColumnTransformer(transformers=[('std', StandardScaler(),
['sugar', 'protein'])])),
('lr', LinearRegression())])ColumnTransformer(transformers=[('std', StandardScaler(),
['sugar', 'protein'])])['sugar', 'protein']
StandardScaler()
LinearRegression()
def rmse(actual, pred):
    """Root-mean-squared error between true and predicted values
    (same units as the target)."""
    squared_error = (actual - pred) ** 2
    return np.sqrt(np.mean(squared_error))
# Baseline RMSE on the train/test split.
y_train_pred = pl.predict(X_train)
y_test_pred = pl.predict(X_test)
pd.DataFrame(
    {'Train': rmse(y_train, y_train_pred),
     'Test': rmse(y_test, y_test_pred)},
    index=['RMSE'],
).T
| RMSE | |
|---|---|
| Train | 1.081129 |
| Test | 1.067257 |
Step 7: Final Model¶
# Final model: degree-3 polynomial expansion of sugar/protein feeding a
# random forest, tuned by 5-fold grid search on negative MSE.
X = df_clean[['sugar', 'protein', 'n_ingredients']]
y = df_clean['rating']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

poly_block = Pipeline([('poly', PolynomialFeatures(degree=3))])
preproc = ColumnTransformer(
    transformers=[('num', poly_block, ['sugar', 'protein'])],
    remainder='passthrough',  # n_ingredients flows through unchanged
)
model = Pipeline([('preproc', preproc), ('rf', RandomForestRegressor())])

param_grid = {
    'rf__n_estimators': [50, 100, 200],
    'rf__max_depth': [2, 3, 5],
    'rf__min_samples_split': [2, 3, 5],
    'rf__max_features': ['sqrt', None],
}
grid_search = GridSearchCV(model, param_grid, cv=5,
                           scoring='neg_mean_squared_error',
                           verbose=1, n_jobs=-1)
grid_search.fit(X_train, y_train)
print(grid_search.best_params_)
Fitting 5 folds for each of 54 candidates, totalling 270 fits
{'rf__max_depth': 3, 'rf__max_features': 'sqrt', 'rf__min_samples_split': 3, 'rf__n_estimators': 200}
# Final-model RMSE on the train/test split (best grid-search estimator).
y_train_pred = grid_search.predict(X_train)
y_test_pred = grid_search.predict(X_test)
pd.DataFrame(
    {'Train': rmse(y_train_pred, y_train),
     'Test': rmse(y_test_pred, y_test)},
    index=['RMSE'],
).T
| RMSE | |
|---|---|
| Train | 1.077796 |
| Test | 1.076795 |
Step 8: Fairness Analysis¶
We conduct a fairness analysis on our model created for the previous section. We divide the dataset into two groups based on the number of minutes it takes to finish the recipe
# Fairness check: does the model err more on quick (<= 30 min) recipes
# than on longer ones?  Statistic: |RMSE_quick - RMSE_long|.
features = ['sugar', 'protein', 'n_ingredients']
less_time = df_clean.query("minutes <= 30")
more_time = df_clean.query("minutes > 30")
less_rmse = rmse(less_time['rating'], grid_search.predict(less_time[features]))
more_rmse = rmse(more_time['rating'], grid_search.predict(more_time[features]))
obs = abs(less_rmse - more_rmse)
obs
# Permutation test for fairness: shuffle `minutes` so group membership is
# random, then rebuild the RMSE-difference statistic each time.
# Accumulate in a list — the original np.append re-copied the whole array
# every iteration (quadratic); one np.array(...) at the end is linear.
features = ['sugar', 'protein', 'n_ingredients']
sim_stats = []
for _ in tqdm(range(1000)):
    shuffled = df_clean.assign(shuffle=np.random.permutation(df_clean['minutes']))
    less_time = shuffled.query("shuffle <= 30")
    more_time = shuffled.query("shuffle > 30")
    less_rmse = rmse(less_time['rating'], grid_search.predict(less_time[features]))
    more_rmse = rmse(more_time['rating'], grid_search.predict(more_time[features]))
    sim_stats.append(abs(less_rmse - more_rmse))
result = np.array(sim_stats)
100%|██████████| 1000/1000 [03:10<00:00, 5.24it/s]
(result > obs).mean()
np.float64(0.0)
# Null distribution of the RMSE gap with the observed gap marked.
fig = px.histogram(
    result, nbins=30, histnorm='probability',
    title='Absolute Difference in Prediction RMSE',
)
fig.add_vline(x=obs, line_color="red",
              annotation_text="Observed Stats",
              annotation_position="top right")
fig.update_layout(showlegend=False,
                  xaxis_title="Absolute Difference in Prediction RMSE",
                  yaxis_title="Frequency")
fig.show()